#!/bin/sh

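# Generate the various necessary data files.  To run the standard sequence
# of steps (article-data, coords, links, combine, in that order), use
#
#  generate-data-files all
#
# or give no arguments at all.  Else, name one or more of the steps:
#
# article-data = Basic article data file
# coords = Article coordinates
# links = Article incoming links, only for articles with coordinates or
#         redirects to such articles
# combine = Combined article data file, only for articles with coordinates or
#           redirects to such articles
#
# The following steps are never run by 'all' and must be requested by name:
#
# location-type = Location-type data (written to standard output)
# counts = Word count data
# toponym-eval = Toponym evaluation data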

# Locate the TextGrounder Python directory.  A non-empty TEXTGROUNDER_PYTHON
# inherited from the environment wins; otherwise fall back to the default
# checkout location under $HOME.
: "${TEXTGROUNDER_PYTHON:=$HOME/devel/textgrounder/python}"

# Pull in the shared configuration.  The steps further down read variables
# that this script never assigns (IN_DUMP_FILE, the OUT_*_FILE names, the
# *_ARG option strings, MAXTIME, DEBUG) -- presumably they are defined by
# this file; verify against config-wikigrounder.
# The path is quoted so that a directory name containing whitespace is not
# split into several words.
. "$TEXTGROUNDER_PYTHON/config-wikigrounder"

# Programs invoked by the individual steps.  These are assigned after the
# config file is sourced, so they override any value it may have set.
PROCESSWIKI="$TEXTGROUNDER_PYTHON/processwiki.py"
GENERATE_COMBINED="$TEXTGROUNDER_PYTHON/generate_combined.py"

# NOTE(review): LOGFILE is not referenced anywhere else in this script --
# confirm whether anything still depends on it.
LOGFILE="generate-all-data.log"

# Extra options appended to every processwiki.py invocation.  Expanded
# unquoted at the point of use so that each option becomes its own word.
OTHEROPTS="$MAXTIME $DEBUG"

# Decide which steps to run.  With no arguments (or only empty ones), or
# with the single argument "all", run the standard sequence; otherwise run
# exactly the steps named on the command line, in the order given.
#
# Two separate tests joined with || replace the obsolescent "-o" operator,
# whose parsing is ambiguous when the argument string happens to look like
# a test operator.
if [ -z "$*" ] || [ "$*" = "all" ]; then
  steps="article-data coords links combine"
else
  steps="$*"
fi

echo "Steps are $steps"

# Decompress the dump and feed it to processwiki.py.
# Arguments: the options selecting what processwiki.py should produce; they
#            are passed ahead of $OTHEROPTS, as in every original call.
# Outputs:   whatever processwiki.py writes to stdout (callers redirect it).
# Returns:   the status of the pipeline, i.e. of processwiki.py.
process_dump() {
  # OTHEROPTS is deliberately unquoted: it holds several options that must
  # be split into separate words.
  bzcat "$IN_DUMP_FILE" | "$PROCESSWIKI" "$@" $OTHEROPTS
}

# Run a single named step.
# Arguments: $1 - step name
# Outputs:   a progress line on stdout; step data goes to the step's output
#            file (location-type writes its data to stdout instead).
# Returns:   the status of the step's command, or 2 (with a message on
#            stderr) if the step name is not recognized.
run_step() {
  # The *_ARG variables are left unquoted on purpose: each is expected to
  # expand to an option plus its value -- TODO confirm in config-wikigrounder.
  case "$1" in
    article-data)
      echo "Generating article data ..."
      process_dump $DISAMBIG_ID_ARG \
        --split-training-dev-test foobar \
        --generate-article-data > "$OUT_ORIG_ARTICLE_DATA_FILE"
      ;;
    coords)
      echo "Generating coordinate data ..."
      process_dump --output-coords > "$OUT_COORDS_FILE"
      ;;
    location-type)
      echo "Generating location-type data ..."
      process_dump --output-location-type
      ;;
    links)
      echo "Generating link data ..."
      process_dump $COORDS_ARG $ORIG_ARTICLE_DATA_ARG \
        --find-links > "$OUT_LINKS_FILE"
      ;;
    # "combined" is accepted as an alias so that asking for the step by the
    # name of its output (the combined article data file) also works.
    combine|combined)
      echo "Combining data ..."
      "$GENERATE_COMBINED" $LINKS_ARG $COORDS_ARG \
        $ORIG_ARTICLE_DATA_ARG > "$OUT_COMBINED_ARTICLE_DATA_FILE"
      ;;
    counts)
      echo "Generating word count data ..."
      process_dump --output-counts > "$OUT_COUNTS_FILE"
      ;;
    toponym-eval)
      echo "Generating toponym eval data ..."
      process_dump $COORDS_ARG $ORIG_ARTICLE_DATA_ARG \
        --generate-toponym-eval > "$OUT_TOPONYM_EVAL_FILE"
      ;;
    *)
      # Diagnostics belong on stderr so they cannot be mistaken for data.
      echo "Unrecognized step $1" >&2
      return 2
      ;;
  esac
}

# Run every requested step in order.  A failing or unrecognized step does
# not stop the remaining steps, but it is remembered.
steps_failed=0
# $steps is intentionally unquoted: it is a space-separated list of names.
for step in $steps; do
  echo "Executing step '$step' ..."
  run_step "$step" || steps_failed=1
done

# As the last command of the script, this test supplies the exit status:
# non-zero if any step failed or was not recognized.
[ "$steps_failed" -eq 0 ]
